In [1]:
import pandas as pd
import numpy as np
import sys
%matplotlib inline

In [2]:
print 'Python version ' + sys.version
print 'Pandas version ' + pd.__version__


Python version 2.7.5 |Anaconda 2.1.0 (64-bit)| (default, Jul  1 2013, 12:37:52) [MSC v.1500 64 bit (AMD64)]
Pandas version 0.15.2

Compute

How to get the sum and length of a group?


In [3]:
df = pd.DataFrame({'group1':["a","a","b","b"],
                'value':[10,20,30,40]
                })
df


Out[3]:
group1 value
0 a 10
1 a 20
2 b 30
3 b 40

In [4]:
group = df.groupby('group1')
group.agg([len,sum])


Out[4]:
value
len sum
group1
a 2 30
b 2 70

How can I add a column that is equal to the sum of a group?


In [5]:
df = pd.DataFrame({'labels':["a","a","b","b"],
                'value':[10,20,30,40]
                })
df


Out[5]:
labels value
0 a 10
1 a 20
2 b 30
3 b 40

In [6]:
group = df.groupby('labels')['value']
df['value.sum'] = group.transform('sum')
df


Out[6]:
labels value value.sum
0 a 10 30
1 a 20 30
2 b 30 70
3 b 40 70

How to get the month name out of a date column?


In [7]:
df = pd.DataFrame({'col1':[pd.Timestamp('20130102000030'),
                         pd.Timestamp('2013-02-03 00:00:30'),
                         pd.Timestamp('3/4/2013 000030')]
                 })
df


Out[7]:
col1
0 2013-01-02 00:00:30
1 2013-02-03 00:00:30
2 2013-03-04 00:00:30

In [8]:
df['MonthNumber'] = df['col1'].apply(lambda x: x.month)
df['Day'] = df['col1'].apply(lambda x: x.day)
df['Year'] = df['col1'].apply(lambda x: x.year)
df['MonthName'] = df['col1'].apply(lambda x: x.strftime('%B'))
df['WeekDay'] = df['col1'].apply(lambda x: x.strftime('%A'))
df


Out[8]:
col1 MonthNumber Day Year MonthName WeekDay
0 2013-01-02 00:00:30 1 2 2013 January Wednesday
1 2013-02-03 00:00:30 2 3 2013 February Sunday
2 2013-03-04 00:00:30 3 4 2013 March Monday

How can I create a column based on two other columns?


In [9]:
df = pd.DataFrame({'col1':['minus','minus','positive','nan'],
                'col2':[10,20,30,40]
                })
df


Out[9]:
col1 col2
0 minus 10
1 minus 20
2 positive 30
3 nan 40

In [10]:
df['col3'] = df['col2']*df['col1'].apply(lambda x: -1 if x=='minus' else (1 if x=='positive' else np.nan))
df


Out[10]:
col1 col2 col3
0 minus 10 -10
1 minus 20 -20
2 positive 30 30
3 nan 40 NaN

How can I apply a function to a group and add the results to my original data frame?


In [11]:
df = pd.DataFrame({'group1':['a','a','a','b','b','b'],
                       'group2':['c','c','d','d','d','e'],
                       'value1':[1.1,2,3,4,5,6],
                       'value2':[7.1,8,9,10,11,12]
})

df


Out[11]:
group1 group2 value1 value2
0 a c 1.1 7.1
1 a c 2.0 8.0
2 a d 3.0 9.0
3 b d 4.0 10.0
4 b d 5.0 11.0
5 b e 6.0 12.0

In [12]:
group = df.groupby(['group1','group2'])

def Half(x):
    return x.sum()

df['new'] = group['value1'].transform(Half)
df


Out[12]:
group1 group2 value1 value2 new
0 a c 1.1 7.1 3.1
1 a c 2.0 8.0 3.1
2 a d 3.0 9.0 3.0
3 b d 4.0 10.0 9.0
4 b d 5.0 11.0 9.0
5 b e 6.0 12.0 6.0

In [13]:
# For multiple functions
def HalfPlus(x):
    return x.sum() + 1

newcol = group['value1'].agg([Half,HalfPlus])
newcol


Out[13]:
Half HalfPlus
group1 group2
a c 3.1 4.1
d 3.0 4.0
b d 9.0 10.0
e 6.0 7.0

In [14]:
df.merge(newcol, left_on=['group1','group2'], right_index=True)


Out[14]:
group1 group2 value1 value2 new Half HalfPlus
0 a c 1.1 7.1 3.1 3.1 4.1
1 a c 2.0 8.0 3.1 3.1 4.1
2 a d 3.0 9.0 3.0 3.0 4.0
3 b d 4.0 10.0 9.0 9.0 10.0
4 b d 5.0 11.0 9.0 9.0 10.0
5 b e 6.0 12.0 6.0 6.0 7.0

How to add two data frames and not get null values?


In [15]:
df1 = pd.DataFrame(data=[26371, 1755, 2], index=[-9999, 240, 138.99], columns=['value'])
df1


Out[15]:
value
-9999.00 26371
240.00 1755
138.99 2

In [16]:
df2 = pd.DataFrame(data=[26371, 1755, 6, 4], index=[-9999, 240, 113.03, 110], columns=['value'])
df2


Out[16]:
value
-9999.00 26371
240.00 1755
113.03 6
110.00 4

In [17]:
# If you simply add them, you will get null values
# were the index does not match
df1 + df2


Out[17]:
value
-9999.00 52742
110.00 NaN
113.03 NaN
138.99 NaN
240.00 3510

In [18]:
# Here we fix this issue
df1.add(df2, fill_value=0)


Out[18]:
value
-9999.00 52742
110.00 4
113.03 6
138.99 2
240.00 3510

Author: David Rojas